Setup

Load libraries

library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)

# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
sequential:
- args: function (expr, envir = parent.frame(), substitute = TRUE, lazy = FALSE, seed = NULL, globals = TRUE, local = TRUE, earlySignal = FALSE, label = NULL, ...)
- tweaked: FALSE
- call: NULL

Process Human Data

import_remote_data <- function(file_url, type = "table", header = FALSE) {
  con <- gzcon(url(file_url))
  txt <- readLines(con)
  if (type == "MM") { return (readMM(textConnection(txt))) }
  if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"

human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
human_ret_seurat
An object of class Seurat 
19712 features across 20091 samples within 1 assay 
Active assay: RNA (19712 features, 0 variable features)

Process Mouse Data

mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data, 
                                       project = "mouse_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
mouse_ret_seurat
An object of class Seurat 
16424 features across 4510 samples within 1 assay 
Active assay: RNA (16424 features, 0 variable features)

Process Primate Data

url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
                auth_token = 'your_token' )
macaque_fovea_seurat
An object of class Seurat 
30039 features across 111993 samples within 1 assay 
Active assay: RNA (30039 features, 0 variable features)

Cleanup

rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)

Combine

# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)

# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
    x <- NormalizeData(x, verbose = FALSE)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})

# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)

Integration

plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50,  anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)

Integrated Analysis

plan("multiprocess", workers = 4)

DefaultAssay(ret.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)

UMAP Visualization

DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")

DimPlot(ret.combined, reduction = "umap", label = TRUE)

DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)

Identify Clusters with Canonical Markers

DefaultAssay(ret.combined) <- "RNA"

features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
                      "Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))

FeaturePlot(object = ret.combined, 
            features = features, 
            pt.size = 0.1,
            cols = c("lightgrey", "#F26969"),
            min.cutoff = "q9",
            combine = TRUE) & NoLegend() & NoAxes()


# Cowplot method: make sure to change to "combine = FALSE" and remove "& NoLegend() & NoAxes"

# for(i in 1:length(p)) {
#   p[[i]] <- p[[i]] + NoLegend() + NoAxes()
# }
# 
# cowplot::plot_grid(plotlist = p, ncol=3)

Markers were determined from this paper and other sources.

Find Differentially Expressed Genes

cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())

cell_type_avg <- function(seurat.combined, ident) {
  cells.x <- subset(seurat.combined, idents = ident)
  Idents(cells.x) <- "orig.ident"
  cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
  cells.x.avg$gene <- rownames(cells.x.avg)
  return(cells.x.avg)
}

cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
  cells.x.avg <- cell_type_avg(ret.combined, ident = x)
  x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
  return(x)
})

# For individual plots
# for (p in cells.plot) {
#   print(p)
# }

# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)

ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"
cells.diffgenes <- as.list(cells.types)
cells.diffgenes <- lapply(cells.diffgenes, FUN = function(x) {
  lab_human <- sprintf("%s_human_ret", x)
  lab_mouse <- sprintf("%s_mouse_ret", x)
  return(FindMarkers(ret.combined, ident.1 = lab_human, ident.2 = lab_mouse))
})


for(i in seq_along(cells.diffgenes)) {
    x <- cells.diffgenes[[i]]
    x <- cbind(x, logp = -log10(x$p_val), types = cells.types[[i]], genes = rownames(x))
    x <- x[!grepl("mt-", x$genes),] # remove mitochondrial genes
    cells.diffgenes[[i]] <- x
    rm(x)
}

Tables with the most differentially expressed genes in each cell subtype:

for(i in seq_along(cells.diffgenes)) {
  print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}

Rod
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
ckb 0 1.4493770 0.918 0.724 0 Inf Rod ckb
hsp90aa1 0 1.3457646 0.854 0.627 0 Inf Rod hsp90aa1
nrl 0 1.3140138 0.874 0.635 0 Inf Rod nrl
0610009b22rik 0 -0.6622860 0.000 0.130 0 Inf Rod 0610009b22rik
gm17018 0 -0.6831275 0.000 0.130 0 Inf Rod gm17018
spata1 0 -0.6929677 0.000 0.132 0 Inf Rod spata1
BC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
neat1 0 3.086391 0.793 0.064 0 Inf BC neat1
mtch1 0 -1.305054 0.000 0.459 0 Inf BC mtch1
selenom 0 -1.338108 0.000 0.480 0 Inf BC selenom
araf 0 -1.342891 0.013 0.494 0 Inf BC araf
klc3 0 -1.424615 0.002 0.500 0 Inf BC klc3
pea15a 0 -1.427543 0.000 0.500 0 Inf BC pea15a
MG
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
tf 0 5.089073 0.962 0.000 0 Inf MG tf
spp1 0 3.879036 0.847 0.003 0 Inf MG spp1
crabp1 0 3.865908 0.876 0.028 0 Inf MG crabp1
gpx3 0 3.736219 0.869 0.052 0 Inf MG gpx3
ftl 0 3.672007 0.877 0.000 0 Inf MG ftl
actg1 0 3.639157 0.905 0.026 0 Inf MG actg1
RGC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
malat1 0e+00 -5.358199 0.000 1.000 0.0000020 10.308117 RGC malat1
gm42418 0e+00 -5.948091 0.000 0.957 0.0000083 9.683676 RGC gm42418
ay036118 0e+00 -2.770196 0.000 0.826 0.0004565 7.944868 RGC ay036118
neat1 0e+00 5.237230 0.889 0.087 0.0007506 7.728937 RGC neat1
linc00599 1e-07 2.669694 0.815 0.000 0.0024960 7.207095 RGC linc00599
kcnq1ot1 1e-07 2.290579 0.926 0.261 0.0025965 7.189955 RGC kcnq1ot1
CC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -5.444663 0.000 1.000 0 80.68864 CC gm42418
malat1 0 -5.893437 0.000 1.000 0 80.68864 CC malat1
ay036118 0 -2.515711 0.000 0.943 0 74.09104 CC ay036118
gnat2 0 -2.832927 0.117 0.943 0 66.80767 CC gnat2
mir124a-1hg 0 -2.420164 0.000 0.869 0 65.97475 CC mir124a-1hg
gngt2 0 -3.274161 0.143 0.937 0 64.88201 CC gngt2
AC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -5.868449 0 1.000 0 112.09869 AC gm42418
malat1 0 -6.139436 0 0.994 0 111.15686 AC malat1
ay036118 0 -2.993976 0 0.935 0 102.83653 AC ay036118
snhg11 0 -3.730037 0 0.916 0 100.12454 AC snhg11
ac149090.1 0 -2.375670 0 0.729 0 75.41475 AC ac149090.1
c230004f18rik 0 -2.693746 0 0.729 0 75.41474 AC c230004f18rik
VC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
hla-b 0 3.429125 0.884 0.00 0 49.06339 VC hla-b
rps3a 0 2.938618 0.826 0.00 0 45.19653 VC rps3a
hla-e 0 3.220179 0.826 0.00 0 45.19650 VC hla-e
hla-a 0 3.030967 0.812 0.00 0 44.24753 VC hla-a
hla-c 0 2.913989 0.797 0.00 0 43.30557 VC hla-c
a2m 0 3.322554 0.797 0.01 0 41.81784 VC a2m
HC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -6.046691 0.000 1.000 0 47.10148 HC gm42418
malat1 0 -5.904074 0.000 0.942 0 43.58186 HC malat1
neat1 0 4.028649 0.967 0.043 0 30.82663 HC neat1
ay036118 0 -2.922930 0.000 0.710 0 30.68474 HC ay036118
pvalb 0 3.437120 0.902 0.000 0 27.66539 HC pvalb
gpi1 0 -2.385417 0.000 0.580 0 24.19037 HC gpi1
M
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
ftl 0 4.612295 0.98 0 0 35.06941 M ftl
hla-dra 0 4.664191 0.94 0 0 33.25514 M hla-dra
hla-a 0 2.899218 0.94 0 0 33.25514 M hla-a
hla-drb1 0 4.096260 0.92 0 0 32.36374 M hla-drb1
rps3a 0 3.397725 0.92 0 0 32.36374 M rps3a
hla-b 0 3.163581 0.92 0 0 32.36374 M hla-b

Save as csv files

for(i in seq_along(cells.diffgenes)) {
  write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
  print(FeaturePlot(object = ret.combined, 
              features = rownames(cells.diffgenes[[i]])[1:genes_to_plot], 
              split.by = "orig.ident", 
              max.cutoff = 3, 
              cols = c("grey", "red"),
              pt.size = 0.07,
              combine = TRUE,
              label.size = 0.5
              ) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
        )
}

Check cell proportion for each species:

knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
human_ret macaque_fovea mouse_ret
0 0.2627047 0.1535810 0.2875831
1 0.5498980 0.0758172 0.3164080
2 0.0001493 0.1855205 0.0002217
3 0.0009955 0.1458216 0.0035477
4 0.0531581 0.0808176 0.0838137
5 0.0114977 0.0783888 0.0388027
6 0.0406650 0.0552267 0.0569845
7 0.0187148 0.0577625 0.0343681
8 0.0484794 0.0443242 0.0917960
9 0.0001493 0.0342075 0.0000000
10 0.0000498 0.0232247 0.0004435
11 0.0076154 0.0156081 0.0152993
12 0.0000000 0.0117775 0.0000000
13 0.0034344 0.0089827 0.0436807
14 0.0000000 0.0109203 0.0000000
15 0.0000000 0.0101435 0.0008869
16 0.0024887 0.0039645 0.0261641
17 0.0000000 0.0039110 0.0000000

Gene Enrichment Analysis

library(ggplot2)
library(ggrepel)
library(scales)
library(data.table)
cells.diffgenes.combined <- rbindlist(cells.diffgenes)

# Preprocessing
# for(i in seq_along(cells.diffgenes.combined)) {
#   if (cells.diffgenes.combined[i]$logp > 750) {
#       cells.diffgenes.combined[i]$logp <- 749
#   }
#   if (cells.diffgenes.combined[i]$avg_logFC > 3) {
#       cells.diffgenes.combined[i]$avg_logFC <- 2.99
#   }
# }
# 
# cells.diffgenes.combined$logp <- gsub("Inf", 749, cells.diffgenes.combined$logp)

ggplot(data=cells.diffgenes.combined, 
           aes(x=avg_logFC,y=logp, colour=types, label = genes)) + 
    geom_point(size=0.2) + 
    theme_bw() + 
    theme(panel.background = element_rect(fill = NA), 
          axis.ticks.x = element_blank(),  
          axis.text.y = element_text(size = 12), 
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank()) + 
    labs(x = "log2(Fold changes)\n(3K/WT)", y ="-log10(p value)") +
    scale_x_continuous(limits=c(-3, 3)) +
    scale_y_continuous(limits=c(1, 300)) +
    geom_hline(yintercept= 1, colour="grey", linetype="dashed", size=0.7 ) +
    geom_vline(xintercept= 0 , colour="grey",   size=0.7)

library(stringr)
plot_enrichment <- function(type = "Rod", info = "") {
    if (info != "") { info.str <- sprintf("_%s", info) }
    else {info.str <- ""}
    file_path <- sprintf("enrich_data/%s%s.txt", type, info.str)
    x <- read.table(file_path, header=T, sep="\t", skip = 11)
    colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
    colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
    colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
    colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
    x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
    x <- x[order(x$FDR),]
    x <- x[1:10,]
    g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) + 
        geom_point(aes(size=Count)) + 
        theme_bw() +
        theme(panel.background = element_rect(fill = NA) , 
              axis.ticks.x=element_blank(), 
              axis.text.y = element_text(size = 12) , 
              panel.grid.major = element_blank(), 
              panel.grid.minor = element_blank()) + 
        labs(x = "Fold Enrichment", y =" ") +
        scale_colour_gradient(low = "red", high = "blue") +
        scale_y_discrete(labels = function(x) str_wrap(x, width = 50)) +
        ggtitle(type, info)
    return(g)
}

for (type in cells.types[-c(6,4)]) {
    print(plot_enrichment(type = type, info = "top200"))
    rm(type)
}

plot_enrichment(type = "RGC")

plot_enrichment(type = "AC")

---
title: "Integrating Primate Data into Analysis"
output: 
  html_notebook:
    toc: true
---
# Setup
Load libraries
```{r message=FALSE, warning=FALSE}
library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)

# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
```
Process Human Data
```{r}
import_remote_data <- function(file_url, type = "table", header = FALSE) {
  con <- gzcon(url(file_url))
  txt <- readLines(con)
  if (type == "MM") { return (readMM(textConnection(txt))) }
  if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"

human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
```
```{r}
rownames(human.count_matrix) <- tolower(human.gene_names[,1])
colnames(human.count_matrix) <- tolower(human.sample_annotations[,1])

human_ret_seurat <- CreateSeuratObject(counts = human.count_matrix, 
                                       meta.data = human.sample_annotations, 
                                       project = "human_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
human_ret_seurat
```

Process Mouse Data
```{r}
mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data, 
                                       project = "mouse_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
mouse_ret_seurat
```

Process Primate Data
```{bash}
url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
```
```{r}
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
                auth_token = 'your_token' )
```
```{r}
load("primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata")

dimnames(Count.mat_fovea)[[1]] <- tolower(dimnames(Count.mat_fovea)[[1]])
macaque_fovea_seurat <- CreateSeuratObject(Count.mat_fovea,
                                           project = "macaque_fovea", 
                                           min.cells = 3, 
                                           min.features = 200)

# give macaque dta uniform name in "orig.ident" metadata column
AddMetaData(macaque_fovea_seurat, 
            metadata = macaque_fovea_seurat[["orig.ident"]], 
            col.name = "orig.sample.name")
macaque_fovea_seurat[["orig.ident"]] <- "macaque_fovea"

macaque_fovea_seurat
```
Cleanup
```{r}
rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)
```


Combine
```{r}
# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)

# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
    x <- NormalizeData(x, verbose = FALSE)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})

# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)
```

# Integration
```{r}
plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50,  anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)
```

# Integrated Analysis
```{r}
plan("multiprocess", workers = 4)

DefaultAssay(ret.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)
```
# UMAP Visualization
```{r warning=FALSE}
DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")
DimPlot(ret.combined, reduction = "umap", label = TRUE)
```
```{r, fig.height = 4, fig.width = 3}
DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)
```

# Identify Clusters with Canonical Markers
```{r}
DefaultAssay(ret.combined) <- "RNA"

features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
                      "Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))

FeaturePlot(object = ret.combined, 
            features = features, 
            pt.size = 0.1,
            cols = c("lightgrey", "#F26969"),
            min.cutoff = "q9",
            combine = TRUE) & NoLegend() & NoAxes()
```

* Rod : pde6a
* AC (amacrine cell) : gad1, slc6a9
* MG (Müller glia) : glul
* BC (bipolar cell) : Trpm, camk2b
* CC (cone cell) : gnat2, arr3
* RGC (retinal ganglial cell) : nefl, thy1
* VC (vascular cell) : mgp, tm4sf1
* M (microglia) : c1qa
* HC (horizontal cell) : sept4

Markers were determined from [this](https://www.nature.com/articles/s41467-019-12780-8) paper and other sources.
```{r}
ret.combined <- RenameIdents(ret.combined, `0` = "MG", `1` = "Rod", `2` = "RGC", 
    `3` = "RGC", `4` = "BC", `5` = "CC", `6` = "BC", `7` = "AC", `8` = "BC", `9` = "RGC", 
    `10` = "RGC", `11`= "HC", `12` = "MG", `13` = "VC", `14` = "RGC", `15` = "RGC", `16` = "M", `17` = "RGC")

DimPlot(ret.combined, label = TRUE)
```


# Find Differentially Expressed Genes
```{r}
cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())

cell_type_avg <- function(seurat.combined, ident) {
  cells.x <- subset(seurat.combined, idents = ident)
  Idents(cells.x) <- "orig.ident"
  cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
  cells.x.avg$gene <- rownames(cells.x.avg)
  return(cells.x.avg)
}

cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
  cells.x.avg <- cell_type_avg(ret.combined, ident = x)
  x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
  return(x)
})

# For individual plots
# for (p in cells.plot) {
#   print(p)
# }

# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)
```
```{r}
ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"
```
```{r}
cells.diffgenes <- as.list(cells.types)
cells.diffgenes <- lapply(cells.diffgenes, FUN = function(x) {
  lab_human <- sprintf("%s_human_ret", x)
  lab_mouse <- sprintf("%s_mouse_ret", x)
  return(FindMarkers(ret.combined, ident.1 = lab_human, ident.2 = lab_mouse))
})


for(i in seq_along(cells.diffgenes)) {
	x <- cells.diffgenes[[i]]
	x <- cbind(x, logp = -log10(x$p_val), types = cells.types[[i]], genes = rownames(x))
	x <- x[!grepl("mt-", x$genes),] # remove mitochondrial genes
	cells.diffgenes[[i]] <- x
	rm(x)
}
```
Tables with the most differentially expressed genes in each cell subtype:
```{r}
for(i in seq_along(cells.diffgenes)) {
  print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}
```
Save as csv files
```{r}
for(i in seq_along(cells.diffgenes)) {
  write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
```

```{r warning=FALSE}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
  print(FeaturePlot(object = ret.combined, 
              features = rownames(cells.diffgenes[[i]])[1:genes_to_plot], 
              split.by = "orig.ident", 
              max.cutoff = 3, 
              cols = c("grey", "red"),
              pt.size = 0.07,
              combine = TRUE,
              label.size = 0.5
              ) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
        )
}
```

Check cell proportion for each species:
```{r}
knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
```

# Gene Enrichment Analysis
```{r warning=FALSE}
library(ggplot2)
library(ggrepel)
library(scales)
library(data.table)
cells.diffgenes.combined <- rbindlist(cells.diffgenes)

# Preprocessing
# for(i in seq_along(cells.diffgenes.combined)) {
# 	if (cells.diffgenes.combined[i]$logp > 750) {
# 		cells.diffgenes.combined[i]$logp <- 749
# 	}
# 	if (cells.diffgenes.combined[i]$avg_logFC > 3) {
# 		cells.diffgenes.combined[i]$avg_logFC <- 2.99
# 	}
# }
# 
# cells.diffgenes.combined$logp <- gsub("Inf", 749, cells.diffgenes.combined$logp)

ggplot(data=cells.diffgenes.combined, 
		   aes(x=avg_logFC,y=logp, colour=types, label = genes)) + 
	geom_point(size=0.2) + 
	theme_bw() + 
	theme(panel.background = element_rect(fill = NA), 
		  axis.ticks.x = element_blank(),  
		  axis.text.y = element_text(size = 12), 
		  panel.grid.major = element_blank(), 
		  panel.grid.minor = element_blank()) + 
	labs(x = "log2(Fold changes)\n(3K/WT)", y ="-log10(p value)") +
	scale_x_continuous(limits=c(-3, 3)) +
	scale_y_continuous(limits=c(1, 300)) +
	geom_hline(yintercept= 1, colour="grey", linetype="dashed", size=0.7 ) +
	geom_vline(xintercept= 0 , colour="grey",   size=0.7)

```

```{r, fig.height = 4, fig.width = 6, dpi = 400, warning=FALSE}
library(stringr)
plot_enrichment <- function(type = "Rod", info = "") {
	if (info != "") { info.str <- sprintf("_%s", info) }
	else {info.str <- ""}
	file_path <- sprintf("enrich_data/%s%s.txt", type, info.str)
	x <- read.table(file_path, header=T, sep="\t", skip = 11)
	colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
	colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
	colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
	colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
	x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
	x <- x[order(x$FDR),]
	x <- x[1:10,]
	g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) + 
		geom_point(aes(size=Count)) + 
		theme_bw() +
		theme(panel.background = element_rect(fill = NA) , 
			  axis.ticks.x=element_blank(), 
			  axis.text.y = element_text(size = 12) , 
			  panel.grid.major = element_blank(), 
			  panel.grid.minor = element_blank()) + 
		labs(x = "Fold Enrichment", y =" ") +
		scale_colour_gradient(low = "red", high = "blue") +
		scale_y_discrete(labels = function(x) str_wrap(x, width = 50)) +
		ggtitle(type, info)
	return(g)
}

for (type in cells.types[-c(6,4)]) {
	print(plot_enrichment(type = type, info = "top200"))
	rm(type)
}
plot_enrichment(type = "RGC")
plot_enrichment(type = "AC")
```

